29 June, 2017
suppressWarnings(library(tidyverse))
suppressWarnings(library(magrittr))
suppressWarnings(library(knitr))
suppressWarnings(library(arsRtools))
unscaled_tss_mat_file <- '/Users/schmidm/Documents/other_people_to_and_from/ClaudiaI/MS_PASHA/unscaled_deeptools_out/Refseqv3_single_isoform_pc_gene_allq_TSSpm5k_matrix.gz'
biotype <- 'pc_allquintiles'
anchor <- 'TSS'
matrix files for ChIPseq all quintiles: /Users/schmidm/Documents/other_people_to_and_from/ClaudiaI/MS_PASHA/unscaled_deeptools_out/Refseqv3_single_isoform_pc_gene_allq_TSSpm5k_matrix.gz
biotype: pc_allquintiles
anchor: TSS
data("chip_samples_table", package = 'arsRtools')
data("Refseq_pc_g2k_q1_second_half_body_scales", package='arsRtools')
data('lineplot_theme', package='arsRtools')
code not run only shown for documentation
This part is not run here, just shown for Documentation. The resulting tidy data frame has been saved and is loaded in the next section.
#!/bin/sh
cd ~/faststorage/ClaudiaI/Pasha_Results/unscaled_wigs
computeMatrix reference-point \
-R "/home/schmidm/annotations/hg19/RefSeqGRCh37.3/Refseq_single_isoform_pc_genes_allquintiles.bed" \
-S *_[P,I][o,n]*.bw \
--referencePoint=TSS --upstream=5000 --downstream=5000 \
--averageTypeBins=sum --missingDataAsZero --binSize=50 \
--outFileName deeptools_out/Refseqv3_single_isoform_pc_gene_allq_TSSpm5k_matrix.gz \
--numberOfProcessors 4
echo "done"
code not run only shown for documentation
bin_values <- arsRtools::load_chip_matrix(unscaled_tss_mat_file, chip_samples_table)
bin_values_filename <- paste0('../data/ChIPseq/ChIPseq_bin_values_', biotype, '_', anchor, '.RData')
Saving bin values to file: ../data/ChIPseq/ChIPseq_bin_values_pc_allquintiles_TSS.RData
save(bin_values, file=bin_values_filename)
everything below is run.
load(bin_values_filename, verbose=TRUE)
bin_values
from Figure S0a Rmd file…
load('../data/ChIPseq/Refseq.3_protein_coding_genes_promoter_ChIP_signal.RData', verbose=TRUE)
pc_ChIP_rel_input
get only quintile information for fusing with bin_values
(pc_ChIP_rel_input %<>%
unite(id, c(gene_id, rna_id, Refseq_id, gene_name), sep=':') %>%
dplyr::select(id, quintile))
(bin_values %<>% left_join(., pc_ChIP_rel_input) %>%
filter(!is.na(quintile)))
BGsub and scaling is normally part of the arsRtools::scale_chip_matrix function. Here the individual code parts are shown:
Note though that quintiles are normally not relevant and dropped in the scale_chip_matrix function.
first split inputs and ips
(inputs <- dplyr::filter(bin_values, ab=='Input') %>%
dplyr::select(id, rel_pos, sample_name, value) %>%
dplyr::rename(used_input = sample_name, Input = value))
(ips <- dplyr::filter(bin_values, ab=='PolII') %>%
dplyr::select(id, quintile, siRNA, ab, main_run, run, rel_pos, sample_name, used_input, value) %>%
dplyr::rename(ChIP = value))
Here we use Refseq_pc_g2k_q1_second_half_body_scales from package arsRtools for scaling
(BGsub <- left_join(ips, inputs) %>%
dplyr::select(id, quintile, siRNA, ab, main_run, run, rel_pos, ChIP, Input) %>%
left_join(., Refseq_pc_g2k_q1_second_half_body_scales) %>%
mutate(scaled_input = Input*input_scale_factor,
BGSub_ChIP = ChIP-scaled_input))
This will result in all ChIP samples having quintile 5 at an average value of ~ 0.
Scaling tries to adjust the ChIP samples from different runs between each other. This is done using the gene body information from quintile 1. One can see the effect quite clearly on TSS-proximal metagene profiles for quintile 1..
(scaled_BGsub <- BGsub %>%
mutate(scaled_BGSub_ChIP = BGSub_ChIP * chip_scale_factor,
siRNA = factor(siRNA, levels=c('siARS2', 'siFFL', 'siRRP40', 'siZ18')),
run = factor(run, levels=c('CI62', 'CI63',
'MA72_z18', 'MA73_z18',
'MA72_rrp40', 'MA73_rrp40')),
main_run = factor(main_run, levels=c('CI siARS2',
'MA siZ18',
'MA siRRP40'))) %>%
dplyr::select(id, quintile, main_run, run, siRNA, rel_pos, scaled_BGSub_ChIP))
This will result in all ChIP samples having gene body values, ie TSS-distal region of quintile 1 at average at a similar value of ~ 1
save(scaled_BGsub, file='../data/Refseq.3_protein_coding_genes_ChIP_scaled_rel2ndhalf_BGsub_values_relTSS.RData')
everything below is run
load('../data/Refseq.3_protein_coding_genes_ChIP_scaled_rel2ndhalf_BGsub_values_relTSS.RData', verbose=TRUE)
## Loading objects:
## scaled_BGsub
scaled_BGsub
## # A tibble: 33,297,600 × 7
## id quintile main_run run
## <chr> <fctr> <fctr> <fctr>
## 1 gene1:rna0:XM_003403543.1:LOC100652771 4 CI siARS2 CI62
## 2 gene7:rna4:NM_001005484.1:OR4F5 4 CI siARS2 CI62
## 3 gene13:rna7:XM_002344251.2:LOC100288646 4 CI siARS2 CI62
## 4 gene14:rna8:NM_001005221.2:OR4F29 5 CI siARS2 CI62
## 5 gene17:rna10:NM_001005277.1:OR4F16 4 CI siARS2 CI62
## 6 gene18:rna11:XM_002342011.2:LOC100287654 4 CI siARS2 CI62
## 7 gene31:rna22:NM_152486.2:SAMD11 3 CI siARS2 CI62
## 8 gene32:rna23:NM_015658.3:NOC2L 3 CI siARS2 CI62
## 9 gene33:rna24:NM_198317.2:KLHL17 3 CI siARS2 CI62
## 10 gene38:rna30:NM_005101.3:ISG15 3 CI siARS2 CI62
## # ... with 33,297,590 more rows, and 3 more variables: siRNA <fctr>,
## # rel_pos <dbl>, scaled_BGSub_ChIP <dbl>
get gene length info
load('../data/annotations/Refseq_single_isoform_pc_genes.RData', verbose=TRUE)
## Loading objects:
## mRNA_genes
mRNA_genes
## # A tibble: 13,944 × 8
## chr start end gene_id rna_id Refseq_id score strand
## * <fctr> <int> <int> <chr> <chr> <chr> <fctr> <fctr>
## 1 chr1 12190 13639 gene1 rna0 XM_003403543.1 . +
## 2 chr1 69091 70008 gene7 rna4 NM_001005484.1 . +
## 3 chr1 329790 342507 gene13 rna7 XM_002344251.2 . +
## 4 chr1 367659 368597 gene14 rna8 NM_001005221.2 . +
## 5 chr1 621096 622034 gene17 rna10 NM_001005277.1 . -
## 6 chr1 647187 659924 gene18 rna11 XM_002342011.2 . -
## 7 chr1 861121 879961 gene31 rna22 NM_152486.2 . +
## 8 chr1 879583 894679 gene32 rna23 NM_015658.3 . -
## 9 chr1 895967 901099 gene33 rna24 NM_198317.2 . +
## 10 chr1 948847 949920 gene38 rna30 NM_005101.3 . +
## # ... with 13,934 more rows
(gene_len_df <- mRNA_genes %>%
mutate(gene_length = end -start) %>%
dplyr::select(gene_id, gene_length))
## # A tibble: 13,944 × 2
## gene_id gene_length
## <chr> <int>
## 1 gene1 1449
## 2 gene7 917
## 3 gene13 12717
## 4 gene14 938
## 5 gene17 938
## 6 gene18 12737
## 7 gene31 18840
## 8 gene32 15096
## 9 gene33 5132
## 10 gene38 1073
## # ... with 13,934 more rows
(scaled_BGsub %<>%
mutate(gene_id = sub(':.*', '', id)) %>%
left_join(., gene_len_df) %>%
filter(gene_length > 7000))
## # A tibble: 22,644,000 × 9
## id quintile main_run run
## <chr> <fctr> <fctr> <fctr>
## 1 gene13:rna7:XM_002344251.2:LOC100288646 4 CI siARS2 CI62
## 2 gene18:rna11:XM_002342011.2:LOC100287654 4 CI siARS2 CI62
## 3 gene31:rna22:NM_152486.2:SAMD11 3 CI siARS2 CI62
## 4 gene32:rna23:NM_015658.3:NOC2L 3 CI siARS2 CI62
## 5 gene39:rna31:NM_198576.2:AGRN 5 CI siARS2 CI62
## 6 gene41:rna33:NM_017891.4:C1orf159 3 CI siARS2 CI62
## 7 gene54:rna53:NM_001130413.3:SCNN1D 4 CI siARS2 CI62
## 8 gene55:rna55:NM_030649.2:ACAP3 1 CI siARS2 CI62
## 9 gene57:rna57:NM_017871.4:CPSF3L 1 CI siARS2 CI62
## 10 gene60:rna60:NM_004421.2:DVL1 1 CI siARS2 CI62
## # ... with 22,643,990 more rows, and 5 more variables: siRNA <fctr>,
## # rel_pos <dbl>, scaled_BGSub_ChIP <dbl>, gene_id <chr>,
## # gene_length <int>
scaled_BGsub_metagene <- scaled_BGsub %>%
group_by(siRNA, run, main_run, rel_pos, quintile) %>%
summarize(mean_scaled_BGSub_value = mean(scaled_BGSub_ChIP))
scaled_BGsub_metagene %>%
filter(!is.na(quintile)) %>%
ggplot(.) +
geom_line(aes(x=rel_pos, y=mean_scaled_BGSub_value, color=siRNA), alpha=.7) +
facet_grid(quintile ~ run) +
lineplot_theme +
xlim(-2000,5000) +
ylab('scaled BGsub ChIP value')
showing both replicates together with area in between colored
scaled_BGsub_metagene %>%
group_by(quintile, siRNA, main_run, rel_pos) %>%
mutate(upper = max(mean_scaled_BGSub_value), lower = min(mean_scaled_BGSub_value), center = mean(mean_scaled_BGSub_value)) %>%
ggplot(.) +
geom_line(aes(x=rel_pos, y=mean_scaled_BGSub_value, color=siRNA), alpha=.7) +
geom_ribbon(aes(x=rel_pos, ymin=lower, ymax=upper, fill=siRNA), alpha=.7 ) +
facet_grid(quintile~main_run) +
lineplot_theme +
xlim(-2000,5000) +
ylab('scaled BGsub ChIP value')
min_val <- min(scaled_BGsub$scaled_BGSub_ChIP[scaled_BGsub$scaled_BGSub_ChIP>0])
scaled_BGsub_ratio <- scaled_BGsub %>%
mutate(KD = ifelse(siRNA == 'siFFL', 'siFFL', 'KD'),
scaled_BGSub_ChIP = ifelse(scaled_BGSub_ChIP <= 0, min_val, scaled_BGSub_ChIP+min_val)) %>%
dplyr::select(id, quintile, rel_pos, KD, main_run, scaled_BGSub_ChIP) %>%
group_by(KD, id, quintile, main_run, KD, rel_pos) %>%
summarize(scaled_BGSub_ChIP = mean(scaled_BGSub_ChIP)) %>%
spread(KD, scaled_BGSub_ChIP) %>%
mutate(log2_ratio = log2(KD)-log2(siFFL) )
data('RNAseq_logFC_heatmap_theme', package='arsRtools')
p <- scaled_BGsub_ratio %>%
filter(!is.na(log2_ratio),
rel_pos > -2000, rel_pos < 5000) %>%
ungroup() %>%
mutate(log2_ratio = case_when(.$log2_ratio > 2 ~ 2,
.$log2_ratio < -2 ~ -2,
TRUE ~ .$log2_ratio)) %>%
ggplot(., aes(x=rel_pos, y=id, fill=log2_ratio)) +
geom_raster(interpolate = FALSE) +
facet_grid(quintile~main_run, scales='free') +
RNAseq_logFC_heatmap_theme +
theme(axis.text.y = element_blank())
p +
ylab('log2FC based on values scaled to pc gene body')
all_zero_ids <- scaled_BGsub_ratio %>%
group_by(id) %>%
summarize(min_value = min(log2_ratio),
max_value = max(log2_ratio)) %>%
filter(min_value == 0, max_value == 0) %$%
id
p <- scaled_BGsub_ratio %>%
filter(!is.na(log2_ratio),
!(id %in% all_zero_ids),
rel_pos > -2000, rel_pos < 5000) %>%
ungroup() %>%
mutate(log2_ratio = case_when(.$log2_ratio > 2 ~ 2,
.$log2_ratio < -2 ~ -2,
TRUE ~ .$log2_ratio)) %>%
ggplot(., aes(x=rel_pos, y=id, fill=log2_ratio)) +
geom_raster(interpolate = FALSE) +
facet_grid(quintile~main_run, scales='free') +
RNAseq_logFC_heatmap_theme +
theme(axis.text.y = element_blank())
p +
ylab('log2FC based on values scaled to pc gene body')
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] arsRtools_0.1 RColorBrewer_1.1-2 rtracklayer_1.30.4
## [4] GenomicRanges_1.22.4 GenomeInfoDb_1.6.3 IRanges_2.4.8
## [7] S4Vectors_0.8.11 BiocGenerics_0.16.1 knitr_1.15.1
## [10] magrittr_1.5 dplyr_0.5.0 purrr_0.2.2
## [13] readr_1.0.0 tidyr_0.6.0 tibble_1.2
## [16] ggplot2_2.2.0 tidyverse_1.0.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.8 futile.logger_1.4.3
## [3] plyr_1.8.4 XVector_0.10.0
## [5] futile.options_1.0.0 bitops_1.0-6
## [7] tools_3.2.3 zlibbioc_1.16.0
## [9] digest_0.6.10 evaluate_0.10
## [11] gtable_0.2.0 DBI_0.5-1
## [13] yaml_2.1.14 stringr_1.1.0
## [15] Biostrings_2.38.4 rprojroot_1.1
## [17] grid_3.2.3 Biobase_2.30.0
## [19] R6_2.2.0 BiocParallel_1.4.3
## [21] XML_3.98-1.5 rmarkdown_1.2
## [23] reshape2_1.4.2 lambda.r_1.1.9
## [25] GenomicAlignments_1.6.3 Rsamtools_1.22.0
## [27] backports_1.0.4 scales_0.4.1
## [29] htmltools_0.3.5 SummarizedExperiment_1.0.2
## [31] assertthat_0.1 colorspace_1.3-2
## [33] labeling_0.3 stringi_1.1.2
## [35] RCurl_1.95-4.8 lazyeval_0.2.0
## [37] munsell_0.4.3